# SEE modeldata package for new datasets
library(tidyverse) # for graphing and data cleaning
library(tidymodels) # for modeling
library(stacks) # for stacking models
library(naniar) # for examining missing values (NAs)
library(lubridate) # for date manipulation
library(moderndive) # for King County housing data
library(DALEX) # for model interpretation
library(DALEXtra) # for extension of DALEX
library(patchwork) # for combining plots nicely
library(dbplyr) # for SQL query "cheating" - part of tidyverse but needs to be loaded separately
library(mdsr) # for accessing some databases - goes with Modern Data Science with R textbook
library(RMySQL) # for accessing MySQL databases
library(RSQLite) # for accessing SQLite databases
#mapping
library(maps) # for built-in maps
library(sf) # for making maps using geom_sf
library(ggthemes) # Lisa added - I like theme_map() for maps :)
#tidytext
library(tidytext) # for text analysis, the tidy way!
library(textdata)
library(reshape2)
library(wordcloud) # for wordcloud
library(stopwords)
theme_set(theme_minimal()) # Lisa's favorite theme
When you finish the assignment, remove the # from the options chunk at the top, so that messages and warnings aren’t printed. If you are getting errors in your code, add error = TRUE so that the file knits. I would recommend not removing the # until you are completely finished.
From now on, GitHub should be part of your routine when doing assignments. I recommend making it part of your process anytime you are working in R, but I’ll make you show it’s part of your process for assignments.
Task: When you are finished with the assignment, post a link below to the GitHub repo for the assignment. If you want to post it to your personal website, that’s ok (not required). Make sure the link goes to a spot in the repo where I can easily find this assignment. For example, if you have a website with a blog and post the assignment as a blog post, link to the post’s folder in the repo. As an example, I’ve linked to my GitHub stacking material here.
You are going to use the King County house data and the same random forest model to predict log_price that I used in the tutorial.
Tasks:
data("house_prices")

# Work on the base-10 log of price and drop the raw price so it cannot
# leak into the model. Integer columns are converted to numeric, which
# fixes a type problem at prediction time.
house_prices <- house_prices %>%
  mutate(log_price = log10(price)) %>%
  mutate(across(where(is.integer), as.numeric)) %>%
  select(-price)

# Reproducible 75/25 train/test split
set.seed(327)
house_split <- initial_split(house_prices, prop = .75)
house_training <- training(house_split)
house_testing <- testing(house_split)
#Recreate the random forest model:
# Recipe: predict log_price from all other columns. Pull the sale month
# out of the date, then demote id/date to an "evaluative" role so they
# are carried along but excluded from the model fit.
ranger_recipe <-
  recipe(log_price ~ ., data = house_training) %>%
  step_date(date, features = "month") %>%
  update_role(id, date, new_role = "evaluative")
#define model
# Random forest regression via the ranger engine, with fixed
# hyperparameters (no tuning): 6 candidate predictors per split,
# minimum node size 10, 200 trees.
ranger_spec <-
  rand_forest(mtry = 6, min_n = 10, trees = 200) %>%
  set_engine("ranger") %>%
  set_mode("regression")

#create workflow
# Bundle the preprocessing recipe and the model spec together
ranger_workflow <-
  workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(ranger_spec)
#fit the model
set.seed(712) # rf randomly samples predictors at each split - seed for reproducibility
ranger_fit <- ranger_workflow %>%
  fit(house_training)

# Create a DALEX explainer for the fitted random forest: `data` holds
# the predictors only, `y` the observed response.
rf_explain <-
  explain_tidymodels(
    model = ranger_fit,
    data = select(house_training, -log_price),
    y = house_training$log_price,
    label = "rf"
  )
## Preparation of a new explainer is initiated
## -> model label : rf
## -> data : 16210 rows 20 cols
## -> data : tibble converted into a data.frame
## -> target variable : 16210 values
## -> predict function : yhat.workflow will be used ( [33m default [39m )
## -> predicted values : No value for predict function target column. ( [33m default [39m )
## -> model_info : package tidymodels , ver. 0.1.2 , task regression ( [33m default [39m )
## -> predicted values : numerical, min = 5.057808 , mean = 5.665083 , max = 6.722883
## -> residual function : difference between y and yhat ( [33m default [39m )
## -> residuals : numerical, min = -0.3314416 , mean = 0.0004168269 , max = 0.2491147
## [32m A new explainer has been created! [39m
# Pick three individual test-set houses to explain with local methods
obs1 <- house_testing %>% slice(5377)
obs2 <- house_testing %>% slice(100)
obs3 <- house_testing %>% slice(2498)
obs1
obs2
obs3
# Price of new_obs's house - just to know because I can't think in logs
# (log_price is base 10, so 10^log_price recovers dollars)
10^(obs1$log_price)
## [1] 1865000
# observation 1 house price is $1,865,000
10^(obs2$log_price)
## [1] 365000
# observation 2 house price is $365,000
10^(obs3$log_price)
## [1] 505000
# observation 3 house price is $505,000
# Break-down profiles: sequential, order-dependent contribution of each
# variable to the prediction for one observation.
break_down_for <- function(obs) {
  predict_parts(explainer = rf_explain,
                new_observation = obs,
                type = "break_down") # the default type
}

#obs1 rf
bd_rf_obs1 <- break_down_for(obs1)
plot(bd_rf_obs1)
bd_rf_obs1
#obs2 rf
bd_rf_obs2 <- break_down_for(obs2)
plot(bd_rf_obs2)
bd_rf_obs2
#obs3 rf
bd_rf_obs3 <- break_down_for(obs3)
plot(bd_rf_obs3)
bd_rf_obs3
# SHAP values: variable contributions averaged over B random orderings
# of the variables (B kept small to start).
shap_for <- function(obs, B = 10) {
  predict_parts(explainer = rf_explain,
                new_observation = obs,
                type = "shap",
                B = B)
}

#obs 1
rf_shap1 <- shap_for(obs1)
plot(rf_shap1)
#obs 2
rf_shap2 <- shap_for(obs2)
plot(rf_shap2)
#obs 3
rf_shap3 <- shap_for(obs3)
plot(rf_shap3)
# NEED these two lines of code always!
# They make sure our explainer is defined correctly to use in the next step
#observation 1
set.seed(2)
model_type.dalex_explainer <- DALEXtra::model_type.dalex_explainer
predict_model.dalex_explainer <- DALEXtra::predict_model.dalex_explainer
# LIME: fit a local linear surrogate around obs1 (response column must be
# dropped), using the 7 most influential features and 1000 perturbed samples
lime_rf_1 <- predict_surrogate(explainer = rf_explain,
new_observation = obs1 %>%
select(-log_price),
n_features = 7,
n_permutations = 1000,
type = "lime")
## Warning: view does not contain enough variance to use quantile binning. Using
## standard binning instead.
## Warning: yr_renovated does not contain enough variance to use quantile binning.
## Using standard binning instead.
## Warning in gower_work(x = x, y = y, pair_x = pair_x, pair_y = pair_y, n =
## NULL, : skipping variable with zero or non-finite range.
# model_r2: fit of the local surrogate; model_prediction: the surrogate's
# prediction; prediction: the random forest's own prediction
lime_rf_1 %>%
select(model_r2, model_prediction, prediction) %>%
distinct()
plot(lime_rf_1) +
labs(x = "Variable")
## Warning in lime::plot_features(x, ...): NAs introduced by coercion
## Warning: Removed 1 rows containing missing values (position_stack).
#observation 2
set.seed(2)
# Re-register the DALEXtra glue methods so lime can use the explainer
model_type.dalex_explainer <- DALEXtra::model_type.dalex_explainer
predict_model.dalex_explainer <- DALEXtra::predict_model.dalex_explainer

# Local LIME surrogate around obs2 (response dropped)
lime_rf_2 <- predict_surrogate(
  explainer = rf_explain,
  new_observation = select(obs2, -log_price),
  n_features = 7,
  n_permutations = 1000,
  type = "lime"
)
## Warning: view does not contain enough variance to use quantile binning. Using
## standard binning instead.
## Warning: yr_renovated does not contain enough variance to use quantile binning.
## Using standard binning instead.
## Warning in gower_work(x = x, y = y, pair_x = pair_x, pair_y = pair_y, n =
## NULL, : skipping variable with zero or non-finite range.

# Local fit quality and surrogate-vs-model predictions
lime_rf_2 %>%
  select(model_r2, model_prediction, prediction) %>%
  distinct()

plot(lime_rf_2) +
  labs(x = "Variable")
#observation 3
set.seed(2)
# Re-register the DALEXtra glue methods so lime can use the explainer
model_type.dalex_explainer <- DALEXtra::model_type.dalex_explainer
predict_model.dalex_explainer <- DALEXtra::predict_model.dalex_explainer

# Local LIME surrogate around obs3 (response dropped)
lime_rf_3 <- predict_surrogate(
  explainer = rf_explain,
  new_observation = select(obs3, -log_price),
  n_features = 7,
  n_permutations = 1000,
  type = "lime"
)
## Warning: view does not contain enough variance to use quantile binning. Using
## standard binning instead.
## Warning: yr_renovated does not contain enough variance to use quantile binning.
## Using standard binning instead.
## Warning in gower_work(x = x, y = y, pair_x = pair_x, pair_y = pair_y, n =
## NULL, : skipping variable with zero or non-finite range.

# Local fit quality and surrogate-vs-model predictions
lime_rf_3 %>%
  select(model_r2, model_prediction, prediction) %>%
  distinct()

plot(lime_rf_3) +
  labs(x = "Variable")
You will use the airlines data from the SQL database that I used in the example in the tutorial. Be sure to include the chunk to connect to the database here. And, when you are finished, disconnect. You may need to reconnect throughout as it times out after a while.
Tasks:
# Connect to the public "airlines" MySQL database that accompanies the
# Modern Data Science with R textbook (read-only public credentials)
con_air <- dbConnect(RMySQL::MySQL(),
dbname = "airlines",
host = "mdsr.cdc7tgkkqd0n.us-east-1.rds.amazonaws.com",
user = "mdsr_public",
password = "ImhsmflMDSwR")
dbListTables(con_air)
## [1] "airports" "carriers" "flights" "planes"
#lapply(dbListConnections(MySQL()), dbDisconnect)
# Lazy reference to the flights table, trimmed to the needed columns.
# NOTE(review): head(100) on a lazy tbl becomes SQL LIMIT 100 without an
# ORDER BY, so downstream summaries describe an arbitrary 100-row sample.
flights <-
tbl(con_air, "flights") %>%
select(month, arr_delay, origin, dest, air_time, distance) %>%
head(100)
#by airport
# Per-origin flight count, mean air time, and mean distance (on the
# 100-row sample), joined to airport metadata by FAA code to get names
flights_air <-
flights %>%
group_by(origin) %>%
summarize(n_flights = n(),
avg_length = mean(air_time),
avg_distance = mean(distance)) %>%
inner_join(tbl(con_air, "airports"),
by = c("origin" = "faa"))
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 3 imported as
## numeric
# Proportion of arrivals more than 20 minutes late, per origin airport,
# computed on the same 100-row sample of flights.
air <-
  tbl(con_air, "flights") %>%
  head(100) %>%
  group_by(origin) %>%
  summarize(prop_late_over20 = mean(arr_delay > 20)) %>%
  arrange(desc(prop_late_over20)) %>%
  rename(prop_late_airport = prop_late_over20)

# Combine the per-airport summaries with the lateness proportions and
# sort worst-first.
airport_info <-
  flights_air %>%
  inner_join(air, by = "origin") %>%
  select(name, n_flights, avg_distance, avg_length, prop_late_airport) %>%
  arrange(desc(prop_late_airport))
#by month
# Per-month flight count, mean air time, and mean distance on the sample
flights_mon <-
  flights %>%
  group_by(month) %>%
  summarize(n_flights = n(),
            avg_length = mean(air_time),
            avg_distance = mean(distance))

# Proportion of arrivals more than 20 minutes late, per month
mon <-
  tbl(con_air, "flights") %>%
  head(100) %>%
  group_by(month) %>%
  summarize(prop_late_over20 = mean(arr_delay > 20)) %>%
  arrange(desc(prop_late_over20)) %>%
  rename(prop_late_month = prop_late_over20)

# Combine the monthly summaries with the lateness proportions,
# sorted worst-first.
month_info <-
  flights_mon %>%
  inner_join(mon, by = "month") %>%
  select(month, n_flights, avg_distance, avg_length, prop_late_month) %>%
  arrange(desc(prop_late_month))
# Reconnect to the database (the remote connection times out after a while)
con_air <- dbConnect(RMySQL::MySQL(),
dbname = "airlines",
host = "mdsr.cdc7tgkkqd0n.us-east-1.rds.amazonaws.com",
user = "mdsr_public",
password = "ImhsmflMDSwR")
dbListTables(con_air)
## [1] "airports" "carriers" "flights" "planes"
# Show the SQL query that dbplyr generated for the airport pipeline
airport_info %>%
show_query()
## <SQL>
## Warning: Missing values are always removed in SQL.
## Use `mean(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
## SELECT `name`, `n_flights`, `avg_distance`, `avg_length`, `prop_late_airport`
## FROM (SELECT `LHS`.`origin` AS `origin`, `LHS`.`n_flights` AS `n_flights`, `LHS`.`avg_length` AS `avg_length`, `LHS`.`avg_distance` AS `avg_distance`, `LHS`.`name` AS `name`, `LHS`.`lat` AS `lat`, `LHS`.`lon` AS `lon`, `LHS`.`alt` AS `alt`, `LHS`.`tz` AS `tz`, `LHS`.`dst` AS `dst`, `LHS`.`city` AS `city`, `LHS`.`country` AS `country`, `RHS`.`prop_late_airport` AS `prop_late_airport`
## FROM (SELECT `LHS`.`origin` AS `origin`, `LHS`.`n_flights` AS `n_flights`, `LHS`.`avg_length` AS `avg_length`, `LHS`.`avg_distance` AS `avg_distance`, `RHS`.`name` AS `name`, `RHS`.`lat` AS `lat`, `RHS`.`lon` AS `lon`, `RHS`.`alt` AS `alt`, `RHS`.`tz` AS `tz`, `RHS`.`dst` AS `dst`, `RHS`.`city` AS `city`, `RHS`.`country` AS `country`
## FROM (SELECT `origin`, COUNT(*) AS `n_flights`, AVG(`air_time`) AS `avg_length`, AVG(`distance`) AS `avg_distance`
## FROM (SELECT *
## FROM (SELECT `month`, `arr_delay`, `origin`, `dest`, `air_time`, `distance`
## FROM `flights`) `dbplyr_001`
## LIMIT 100) `dbplyr_002`
## GROUP BY `origin`) `LHS`
## INNER JOIN `airports` AS `RHS`
## ON (`LHS`.`origin` = `RHS`.`faa`)
## ) `LHS`
## INNER JOIN (SELECT `origin`, `prop_late_over20` AS `prop_late_airport`
## FROM (SELECT *
## FROM (SELECT `origin`, AVG(`arr_delay` > 20.0) AS `prop_late_over20`
## FROM (SELECT *
## FROM `flights`
## LIMIT 100) `dbplyr_003`
## GROUP BY `origin`) `dbplyr_004`
## ORDER BY `prop_late_over20` DESC) `dbplyr_005`) `RHS`
## ON (`LHS`.`origin` = `RHS`.`origin`)
## ) `dbplyr_006`
## ORDER BY `prop_late_airport` DESC
# Show the SQL query that dbplyr generated for the month pipeline
month_info %>%
show_query()
## <SQL>
## SELECT `month`, `n_flights`, `avg_distance`, `avg_length`, `prop_late_month`
## FROM (SELECT `LHS`.`month` AS `month`, `LHS`.`n_flights` AS `n_flights`, `LHS`.`avg_length` AS `avg_length`, `LHS`.`avg_distance` AS `avg_distance`, `RHS`.`prop_late_month` AS `prop_late_month`
## FROM (SELECT `month`, COUNT(*) AS `n_flights`, AVG(`air_time`) AS `avg_length`, AVG(`distance`) AS `avg_distance`
## FROM (SELECT *
## FROM (SELECT `month`, `arr_delay`, `origin`, `dest`, `air_time`, `distance`
## FROM `flights`) `dbplyr_007`
## LIMIT 100) `dbplyr_008`
## GROUP BY `month`) `LHS`
## INNER JOIN (SELECT `month`, `prop_late_over20` AS `prop_late_month`
## FROM (SELECT *
## FROM (SELECT `month`, AVG(`arr_delay` > 20.0) AS `prop_late_over20`
## FROM (SELECT *
## FROM `flights`
## LIMIT 100) `dbplyr_009`
## GROUP BY `month`) `dbplyr_010`
## ORDER BY `prop_late_over20` DESC) `dbplyr_011`) `RHS`
## ON (`LHS`.`month` = `RHS`.`month`)
## ) `dbplyr_012`
## ORDER BY `prop_late_month` DESC
SELECT `name`, `n_flights`, `avg_distance`, `avg_length`, `prop_late_airport`
FROM (SELECT `LHS`.`origin` AS `origin`, `LHS`.`n_flights` AS `n_flights`, `LHS`.`avg_length` AS `avg_length`, `LHS`.`avg_distance` AS `avg_distance`, `LHS`.`name` AS `name`, `LHS`.`lat` AS `lat`, `LHS`.`lon` AS `lon`, `LHS`.`alt` AS `alt`, `LHS`.`tz` AS `tz`, `LHS`.`dst` AS `dst`, `LHS`.`city` AS `city`, `LHS`.`country` AS `country`, `RHS`.`prop_late_airport` AS `prop_late_airport`
FROM (SELECT `LHS`.`origin` AS `origin`, `LHS`.`n_flights` AS `n_flights`, `LHS`.`avg_length` AS `avg_length`, `LHS`.`avg_distance` AS `avg_distance`, `RHS`.`name` AS `name`, `RHS`.`lat` AS `lat`, `RHS`.`lon` AS `lon`, `RHS`.`alt` AS `alt`, `RHS`.`tz` AS `tz`, `RHS`.`dst` AS `dst`, `RHS`.`city` AS `city`, `RHS`.`country` AS `country`
FROM (SELECT `origin`, COUNT(*) AS `n_flights`, AVG(`air_time`) AS `avg_length`, AVG(`distance`) AS `avg_distance`
FROM (SELECT *
FROM (SELECT `month`, `arr_delay`, `origin`, `dest`, `air_time`, `distance`
FROM `flights`) `dbplyr_290`
LIMIT 100) `dbplyr_291`
GROUP BY `origin`) `LHS`
INNER JOIN `airports` AS `RHS`
ON (`LHS`.`origin` = `RHS`.`faa`)
) `LHS`
INNER JOIN (SELECT `origin`, `prop_late_over20` AS `prop_late_airport`
FROM (SELECT *
FROM (SELECT `origin`, AVG(`arr_delay` > 20.0) AS `prop_late_over20`
FROM (SELECT *
FROM `flights`
LIMIT 100) `dbplyr_292`
GROUP BY `origin`) `dbplyr_293`
ORDER BY `prop_late_over20` DESC) `dbplyr_294`) `RHS`
ON (`LHS`.`origin` = `RHS`.`origin`)
) `dbplyr_295`
ORDER BY `prop_late_airport` DESC
| name | n_flights | avg_distance | avg_length | prop_late_airport |
|---|---|---|---|---|
| Hartsfield Jackson Atlanta Intl | 2 | 569.5000 | 78.5000 | 1.0000 |
| Fort Lauderdale Hollywood Intl | 1 | 1119.0000 | 131.0000 | 1.0000 |
| Southwest Florida Intl | 1 | 1102.0000 | 129.0000 | 1.0000 |
| La Guardia | 2 | 1013.0000 | 141.5000 | 1.0000 |
| Metropolitan Oakland Intl | 1 | 2409.0000 | 280.0000 | 1.0000 |
| Norman Y Mineta San Jose Intl | 1 | 2570.0000 | 293.0000 | 1.0000 |
| Palm Beach Intl | 1 | 1028.0000 | 120.0000 | 1.0000 |
| Washington Dulles Intl | 1 | 542.0000 | 82.0000 | 1.0000 |
| Newark Liberty Intl | 3 | 607.3333 | 93.6667 | 0.6667 |
| John F Kennedy Intl | 4 | 1266.0000 | 160.0000 | 0.5000 |
SELECT `month`, `n_flights`, `avg_distance`, `avg_length`, `prop_late_month`
FROM (SELECT `LHS`.`month` AS `month`, `LHS`.`n_flights` AS `n_flights`, `LHS`.`avg_length` AS `avg_length`, `LHS`.`avg_distance` AS `avg_distance`, `RHS`.`prop_late_month` AS `prop_late_month`
FROM (SELECT `month`, COUNT(*) AS `n_flights`, AVG(`air_time`) AS `avg_length`, AVG(`distance`) AS `avg_distance`
FROM (SELECT *
FROM (SELECT `month`, `arr_delay`, `origin`, `dest`, `air_time`, `distance`
FROM `flights`) `dbplyr_296`
LIMIT 100) `dbplyr_297`
GROUP BY `month`) `LHS`
INNER JOIN (SELECT `month`, `prop_late_over20` AS `prop_late_month`
FROM (SELECT *
FROM (SELECT `month`, AVG(`arr_delay` > 20.0) AS `prop_late_over20`
FROM (SELECT *
FROM `flights`
LIMIT 100) `dbplyr_298`
GROUP BY `month`) `dbplyr_299`
ORDER BY `prop_late_over20` DESC) `dbplyr_300`) `RHS`
ON (`LHS`.`month` = `RHS`.`month`)
) `dbplyr_301`
ORDER BY `prop_late_month` DESC
| month | n_flights | avg_distance | avg_length | prop_late_month |
|---|---|---|---|---|
| 10 | 100 | 972.72 | 122.43 | 0.23 |
# Execute the lazy queries and pull the results into local data frames
airport_df <- as.data.frame(airport_info, row.names = TRUE)
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 3 imported as
## numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 4 imported as
## numeric
month_df <- as.data.frame(month_info, row.names = TRUE)
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 3 imported as
## numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 4 imported as
## numeric
# Ten airports with the highest proportion of >20-minute-late arrivals
# (airport_df is already sorted, but the arrange makes it explicit)
worst_10_airports <-
airport_df %>%
select(name, n_flights, avg_distance, avg_length, prop_late_airport) %>%
arrange(desc(prop_late_airport)) %>%
head(10)
# Bar chart of the worst ten; x labels angled so airport names fit
ggplot(worst_10_airports, aes(x = name, y = prop_late_airport)) +
geom_col() +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
labs(title = "10 Worst Airports", x = "Airport", y = "Proportion of >20 mins late")
# Simple table of the first six airports' names and lateness proportions
knitr::kable(head(airport_df[1:6, c(1,5)]), "simple")
| name | prop_late_airport |
|---|---|
| Washington Dulles Intl | 1 |
| Hartsfield Jackson Atlanta Intl | 1 |
| Fort Lauderdale Hollywood Intl | 1 |
| Southwest Florida Intl | 1 |
| La Guardia | 1 |
| Metropolitan Oakland Intl | 1 |
# Number of departures per airport (on the 100-row sample)
ggplot(airport_df, aes(x = name, y = n_flights)) +
geom_col() +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
labs(title = "Most Popular Airport", x = "Airport", y = "Number of Departures")
If you need to revisit the material, it is posted on the moodle page. I’ve tried to add all the necessary libraries to the top, but I may have missed something.
geom_sf() tasks:
library("maps")
library("lwgeom") # geometry operations (st_area etc.) on lon/lat data
## Linking to liblwgeom 3.0.0beta1 r16016, GEOS 3.8.1, PROJ 6.3.1
# Convert the maps package's US state outlines into an sf object
states <- st_as_sf(map("state", plot = FALSE, fill = TRUE))
head(states)
# Add each state's area as a plain numeric column (st_area returns units)
states <- states %>%
mutate(area = as.numeric(st_area(states)))
# Choropleth of state area; sqrt transform compresses the large range,
# window clipped to the continental US
ggplot(data = states) +
geom_sf(aes(fill = area)) +
scale_fill_viridis_c(trans = "sqrt", alpha = .4) +
coord_sf(xlim = c(-127, -63),
ylim = c(24, 51),
expand = FALSE)
# Attach centroid coordinates (columns X and Y) to each state
states <- cbind(states, st_coordinates(st_centroid(states)))
## Warning in st_centroid.sf(states): st_centroid assumes attributes are constant
## over geometries of x
## Warning in st_centroid.sfc(st_geometry(x), of_largest_polygon =
## of_largest_polygon): st_centroid does not give correct centroids for longitude/
## latitude data
# Same area choropleth, now with a point at each state's centroid
ggplot(data = states) +
geom_sf(aes(fill = area)) +
scale_fill_viridis_c(trans = "sqrt", alpha = .4) +
geom_point(data = states, aes(X, Y), size = 1) +
coord_sf(xlim = c(-127, -63), ylim = c(24, 51),
expand = FALSE)
# Load US county boundaries as an sf object and compute each county's
# area. (Removed `counties <- subset(counties)` from the original: a
# subset() with no condition selects every row and is a no-op.)
counties <- st_as_sf(map("county", plot = FALSE, fill = TRUE))
counties$area <- as.numeric(st_area(counties)) # numeric, strips units
head(counties)

# State area choropleth with county outlines and state centroids overlaid
ggplot(data = states) +
  geom_sf(aes(fill = area)) +
  geom_sf(data = counties, fill = NA, color = gray(.5)) +
  scale_fill_viridis_c(trans = "sqrt", alpha = .4) +
  geom_point(data = states, aes(X, Y), size = 1) +
  coord_sf(xlim = c(-127, -63), ylim = c(24, 51),
           expand = FALSE)

# Zoom to the west coast, labeling each state at its centroid
ggplot(data = states) +
  geom_sf(aes(fill = area)) +
  geom_sf(data = counties, fill = NA, color = gray(.5)) +
  scale_fill_viridis_c(trans = "sqrt", alpha = .4) +
  geom_text(data = states, aes(X, Y, label = ID), size = 4) +
  coord_sf(xlim = c(-125, -114), ylim = c(30, 42), expand = FALSE)
tidytext tasks:
Now you will try using tidytext on a new dataset about Russian Troll tweets.
These are tweets from Twitter handles that are connected to the Internet Research Agency (IRA), a Russian “troll factory.” The majority of these tweets were posted from 2015-2017, but the datasets encompass tweets from February 2012 to May 2018.
Three of the main categories of troll tweet that we will be focusing on are Left Trolls, Right Trolls, and News Feed. Left Trolls usually pretend to be BLM activists, aiming to divide the Democratic Party (in this context, being pro-Bernie so that votes are taken away from Hillary). Right Trolls imitate Trump supporters, and News Feed handles are “local news aggregators,” typically linking to legitimate news.
For our upcoming analyses, some important variables are:
Variable documentation can be found on Github and a more detailed description of the dataset can be found in this fivethirtyeight article.
Because there are 12 datasets containing 2,973,371 tweets sent by 2,848 Twitter handles in total, we will be using three of these datasets (one from a Right troll, one from a Left troll, and one from a News Feed account).
# Read one of the 12 IRA troll-tweet CSVs from fivethirtyeight's repo
troll_tweets <- read_csv("https://raw.githubusercontent.com/fivethirtyeight/russian-troll-tweets/master/IRAhandle_tweets_12.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   post_type = col_logical(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double(),
##   tco3_step1 = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 110978 parsing failures.
## row col expected actual file
## 14784 post_type 1/0/T/F/TRUE/FALSE RETWEET 'https://raw.githubusercontent.com/fivethirtyeight/russian-troll-tweets/master/IRAhandle_tweets_12.csv'
## 26336 post_type 1/0/T/F/TRUE/FALSE RETWEET 'https://raw.githubusercontent.com/fivethirtyeight/russian-troll-tweets/master/IRAhandle_tweets_12.csv'
## 27167 post_type 1/0/T/F/TRUE/FALSE RETWEET 'https://raw.githubusercontent.com/fivethirtyeight/russian-troll-tweets/master/IRAhandle_tweets_12.csv'
## 27168 post_type 1/0/T/F/TRUE/FALSE RETWEET 'https://raw.githubusercontent.com/fivethirtyeight/russian-troll-tweets/master/IRAhandle_tweets_12.csv'
## 27169 post_type 1/0/T/F/TRUE/FALSE RETWEET 'https://raw.githubusercontent.com/fivethirtyeight/russian-troll-tweets/master/IRAhandle_tweets_12.csv'
## ..... ......... .................. ....... .......................................................................................................
## See problems(...) for more details.
# NOTE(review): the parsing failures are "RETWEET" values in post_type,
# which read_csv guessed as logical; those cells become NA. Harmless here
# since post_type is not used below.
troll_tweets
# Keep only English-language tweets
troll_tweets <-
troll_tweets %>%
filter(language == "English")
dim(troll_tweets)
## [1] 175966 21
#Dimensions are 175966 by 21
library(ggplot2) # already attached via tidyverse; kept from the original

# Tweet counts by region
ggplot(troll_tweets, aes(x = region)) +
  geom_bar()

# Tweet counts by region, colored by account category.
# BUG FIX: the original left a trailing `+` after theme(), which chained
# this plot onto the next ggplot() call — adding a ggplot object to a
# ggplot is an error, and the second plot was never drawn on its own.
# The two plots are now separate expressions.
ggplot(troll_tweets, aes(x = region, fill = account_category)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

# Tweet counts by region, colored by account type
ggplot(troll_tweets, aes(x = region, fill = account_type)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Tokenize: one row per word drawn from the tweet text
troll_tweets_untoken <- troll_tweets %>%
  unnest_tokens(word, content)
troll_tweets_untoken

#get rid of stopwords (the, and, etc.)
troll_tweets_cleaned <- troll_tweets_untoken %>%
  anti_join(get_stopwords())
## Joining, by = "word"

# Drop URL fragments, twitter boilerplate, and bare digits.
# BUG FIX: the original ended with `filter(word != (1:9))`, which recycles
# the vector 1:9 element-wise against the word column (coerced to
# character), so a word was only dropped when it happened to match its
# row position mod 9 — not "remove the digits 1-9" as intended. It also
# filtered "amp" twice. Both replaced with explicit %in% tests.
troll_tweets_cleaned <- troll_tweets_cleaned %>%
  filter(!word %in% c("http", "https", "t.co", "t,co", "rt", "amp"),
         !word %in% as.character(1:9))
troll_tweets_cleaned
# The 50 most frequently occurring words
troll_tweets_small <- troll_tweets_cleaned %>%
  count(word) %>%
  slice_max(order_by = n, n = 50)
troll_tweets_small

# visualize the number of times the 50 top words appear
ggplot(troll_tweets_small, aes(x = word, y = n)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

# look at sentiment: the bing lexicon labels words positive/negative
sentiment <- get_sentiments("bing")
sentiment

# keep only the tokens that carry a bing sentiment
troll_tweets_sentiment <- troll_tweets_cleaned %>%
  inner_join(sentiment, by = "word")
troll_tweets_sentiment

# count the sentiments
troll_tweets_sentiment %>%
  group_by(sentiment) %>%
  count()
Be sure to remove the eval=FALSE!!!!
# make a wordcloud where the size of the word is based on the number of
# times the word appears across the tweets
troll_tweets_small %>%
  with(wordcloud(word, n, max.words = 35))

troll_tweets_sentiment

# make a wordcloud colored by sentiment.
# FIX: count word/sentiment pairs explicitly and tell acast which column
# holds the values. The original relied on acast guessing a value column
# and defaulting the aggregation to length (hence the two warnings); the
# resulting count matrix is identical.
troll_tweets_sentiment %>%
  count(word, sentiment) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("red","blue"),
                   max.words = 35)
Task: